import numpy as np
import pandas as pd
# Diagram of plant (stray markdown cell from the original notebook; commented out so the file parses)
# Plant 2 sensor data: a weather file (plant-level sensor) and a generation file (per inverter).
# Forward slashes are portable: pandas/open accept them on Windows too, while the
# original backslash-escaped paths only worked on Windows.
weather_file = 'solar_panel_data/Plant_2_Weather_Sensor_Data.csv'
gen_file = 'solar_panel_data/Plant_2_Generation_Data.csv'
# parse_dates=[0]: the first column is DATE_TIME; drop rows with missing values.
gen_df = pd.read_csv(gen_file, parse_dates=[0]).dropna()
weather_df = pd.read_csv(weather_file, parse_dates=[0]).dropna()
# Both datasets are time-series, so we can merge them together for easier manipulation.
# SOURCE_KEY differs between the two files (inverter id vs. weather-sensor id),
# so the outer join keeps rows from both sides.
source_df = gen_df.merge(weather_df, on=["DATE_TIME", "PLANT_ID", "SOURCE_KEY"], how="outer").sort_values("DATE_TIME")
source_df  # notebook-style display of the merged frame
# Rename verbose sensor names: build a lookup table mapping each SOURCE_KEY to a
# compact integer id, numbered in order of first appearance.
unique_keys = source_df["SOURCE_KEY"].drop_duplicates().reset_index(drop=True)
source_ids = unique_keys.reset_index().rename(columns={'index': 'source_id'})
# Based on the description of the data, 'DAILY_YIELD' and 'TOTAL_YIELD' are just power integrated over time.
# These are not needed because we already have power.
features = ['DC_POWER', 'AC_POWER', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']
# Pivot for easier analysis: attach the compact source_id, then pivot so each
# (feature, source_id) pair becomes its own column, indexed by timestamp.
piv_source_df = source_df.merge(source_ids, on="SOURCE_KEY")\
.pivot_table(index="DATE_TIME", columns=['source_id'], values=features)\
.reset_index()\
.sort_values('DATE_TIME')
# Flatten column names: pivot_table yields a (feature, source_id) MultiIndex;
# collapse to flat strings like "3DC_POWER". The DATE_TIME column keeps its
# name since its second level is the empty string.
piv_source_df.columns = [f"{str(col[1])}{col[0]}" for col in piv_source_df.columns.values]
# Time delta in seconds between consecutive samples (first row is NaN).
piv_source_df["dt"] = (piv_source_df["DATE_TIME"] - piv_source_df["DATE_TIME"].shift(1)).apply(lambda x: x.total_seconds())
piv_source_df.dropna().groupby(["dt"]).size() # There are samples missing
# Interpolate over missing values
# Generate desired timestamps. Data is already recorded at 15 minutes, so 15 minute periods work naturally
resampled_ts = pd.DataFrame(pd.date_range(start=piv_source_df["DATE_TIME"].min(),
end=piv_source_df["DATE_TIME"].max(),
freq="15min",
name="DATE_TIME"))
# Will not interpolate over a period longer than 5*15min (==1:15:00)
limit = 5
# Outer-merge against the complete 15-minute grid so missing timestamps appear
# as NaN rows, then linearly fill gaps of up to `limit` consecutive samples.
# NOTE(review): interpolate runs on positional order BEFORE the sort_values —
# this assumes rows come out of the merge roughly time-ordered; confirm.
clean_df = piv_source_df.merge(resampled_ts, on="DATE_TIME", how="outer")\
.interpolate(limit=limit)\
.sort_values("DATE_TIME")\
.reset_index()
# Timesteps are now always 15 minutes
# Recompute dt to verify a uniform grid; expected output: NaN (first row) and 900.0.
clean_df["dt"] = (clean_df["DATE_TIME"] - clean_df["DATE_TIME"].shift(1)).apply(lambda x: x.total_seconds())
print(clean_df["dt"].drop_duplicates())
del clean_df["dt"]
# Can now set DATE_TIME as index
clean_df.set_index("DATE_TIME", inplace=True)
# Look for areas where dimensionality reduction could be applied
correlation = clean_df.corr()
# Collect every off-diagonal column pair whose correlation is >= 0.9,
# deduplicated by sorting each pair's names before adding to the set.
# (NaN entries compare False against 0.9 and are skipped, as before.)
high_corr = {
    (tuple(sorted([a, b])), correlation.loc[a, b])
    for a in correlation.index
    for b in correlation.columns
    if a != b and correlation.loc[a, b] >= 0.9
}
# We can see that this high-dimensional data could be reduced to a few dimensions
high_corr
# NOTE(review): panels produce DC and the inverter outputs AC, so the original
# two comments here appear swapped — avg_dc_power is the panel-side metric and
# avg_ac_power the inverter-side one. Values are unchanged.
# Average AC power across all inverters.
ac_power_cols = [col for col in clean_df.columns if "AC_POWER" in col]
clean_df["avg_ac_power"] = clean_df[ac_power_cols].mean(axis=1)
# Average DC power across all inverters.
dc_power_cols = [col for col in clean_df.columns if "DC_POWER" in col]
clean_df["avg_dc_power"] = clean_df[dc_power_cols].mean(axis=1)
clean_df["avg_dc_power"].plot()
# Remove AC power columns, as they are redundant with DC power, except DC power is affected by inverter efficiency
# Now have a reduced data set for faster analysis.
# Also drops the leftover 'index' column produced by the earlier reset_index().
df = clean_df.drop(columns=(ac_power_cols + dc_power_cols + ["index"]))
# Strip the weather sensor's source-id prefix and lowercase the names.
# NOTE(review): hard-codes '11' as the weather sensor's source_id — fragile;
# verify against source_ids if the input data changes.
df.columns = [col.replace('11', '').lower() for col in df.columns]
import plotly.express as px
# The plot shows that irradiation is a much better predictor of solar panel output than ambient temperature
# It also shows that there are many outliers, where power output is much lower than typical
# Interactive 3D scatter: both weather predictors against average AC power.
fig = px.scatter_3d(df, x='irradiation', y='ambient_temperature', z='avg_ac_power')
fig.show()
# 2D pandas plots of each predictor against power, one at a time.
df.plot(x='irradiation', y ='avg_ac_power' )
df.plot(x='ambient_temperature', y ='avg_ac_power' )
# Use a regression model
# The time-dependent effects can be ignored if the time delay of the effect of temperature and irradiation is << 15 minutes (the sample rate)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
# Using Huber regression to lessen the effect of the outliers
from sklearn.linear_model import HuberRegressor

features = ['irradiation']
target = ['avg_ac_power']
X = df[features]
y = df[target]
# random_state pins the split so the RMSE below is reproducible run-to-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
regressor = HuberRegressor()
# .values.ravel(): flatten the single-column target to the 1-D array sklearn
# expects (passing the DataFrame triggers a DataConversionWarning).
regressor.fit(X_train, y_train.values.ravel())
y_prediction = regressor.predict(X_test)
RMSE = sqrt(mean_squared_error(y_true=y_test, y_pred=y_prediction))
# Overlay actual vs. predicted power for the held-out samples.
predict_df = y_test.copy()
predict_df['predict'] = y_prediction
predict_df.plot()
# Performance using linear regression is good enough
RMSE
# Ratio of output power to irradiation as a crude panel-efficiency metric.
df["panel_eff_metric"] = df["avg_ac_power"].div(df["irradiation"])
# Drop NaNs due to 0 irradiation: keep only rows with nonzero irradiation.
zero_irradiation = df["irradiation"] == 0
q2_df = df[~zero_irradiation]
q2_df.plot(x='irradiation', y='panel_eff_metric')
# Using polynomial fit
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
# LinearRegression was used below but never imported in the original (NameError).
from sklearn.linear_model import LinearRegression

features = ['ambient_temperature']
target = ['panel_eff_metric']
X = q2_df[features]
y = q2_df[target]
# Split FIRST, then fit the scaler on the training data only, so no test-set
# statistics leak into the preprocessing (the original scaled before splitting).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
scale = StandardScaler()
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.transform(X_test)
# Expand to cubic polynomial features, then fit an ordinary linear model on them.
poly_reg = PolynomialFeatures(degree=3)
X_poly = poly_reg.fit_transform(X_train_scaled)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)
# transform (not fit_transform): the feature expander is already fitted on train.
y_prediction = regressor.predict(poly_reg.transform(X_test_scaled))
RMSE = sqrt(mean_squared_error(y_true=y_test, y_pred=y_prediction))
# Overlay actual vs. predicted efficiency metric for the held-out samples.
# (The original also called predict_df.plot(x='') which raises KeyError —
# removed, since no column is named ''.)
predict_df = y_test.copy()
predict_df['predict'] = y_prediction
predict_df.plot()
# Performance using linear regression is poor
RMSE
# Age of the inverter/solar panel would be an important feature for determining degradation over time
# Module temperature could also influence inverter efficiency
# Elapsed time since the first sample: DatetimeIndex minus its first entry
# gives timedeltas, converted to seconds below.
df["age"] = df.index - df.index[0]
df["age"] = df["age"].apply(lambda x: x.total_seconds())
# Ratio of DC input to AC output per timestep.
# NOTE(review): inverter efficiency is conventionally AC_out / DC_in, so this
# is the reciprocal (values >= 1 when the inverter loses power) — confirm the
# intended orientation before using downstream.
df["inv_eff_metric"] = df["avg_dc_power"]/df["avg_ac_power"]
# Questions: (stray markdown cell from the original notebook; commented out so the file parses)